我的github地址:GitHub - yuyongsheng1990/python_spider_from_bdbaike

# -*- coding: UTF-8 -*- # @Project -> File: python_spider_from_bdbaike -> spider_baike_text_picture # @Time: 2021/6/3 20:13 # @Description: 从百度百科爬取人物的基本信息、信息框数据和图片 import os from urllib.request import urlretrieve import urllib.parse from urllib.error import HTTPError import requests from bs4 import BeautifulSoup from lxml import etree import re import xlwt import xlrd from xlutils.copy import copy # 防止ssl报错 import ssl ssl._create_default_https_context = ssl._create_unverified_context # 爬虫程序 def claw(content): # 访问、下载html网页 url = '' + urllib.parse.quote(content) # 请求地址 # 请求头部,伪造浏览器,防止爬虫被反 headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36' } # 利用请求地址和请求头部构造请求对象 req = urllib.request.Request(url=url, headers=headers, method='GET') response = urllib.request.urlopen(req) # 发送请求,获得响应 text ='utf-8') # 读取响应,获得文本 # ---------------------------------------------------------------------------------------------------- # 解析html网页 soup = BeautifulSoup(text, 'lxml') # 创建soup对象,获取html源码 intro_tag = soup.find_all('div', class_="lemma-summary") # 获取百科基本信息列表 name_tag = soup.find_all('dt', class_="basicInfo-item name") # 找到所有dt标签,返回一个标签列表 value_tag = soup.find_all('dd', class_="basicInfo-item value") # 找到所有dd标签,返回一个标签列表 # 处理基本信息:过滤数据,去掉空白 intro_after_filter = [re.sub('\n+', '', item.get_text()) for item in intro_tag] intro_after_filter = [''.join(i.split()) for i in intro_after_filter] # 去除/0a乱码 # 将字符串列表连成字符串并返回 intro_after_filter = ''.join(intro_after_filter) # print(intro_after_filter) # 抽取信息框数据 profile_info = {} namelist = [] valuelist = [] for i in name_tag: # 将所有dt标签内容存入列表 name = i.get_text() name = ''.join(name.split()) # 去除/0a乱码 namelist.append(name) for i in value_tag: # 将所有dd标签内容存入列表 value = i.get_text().strip(' ') # value = re.sub('\n+', '、', i.get_text()).strip('、') # 老师不让删除换行符 # value = ''.join(value.split()) # 删除可能存在的乱吗/0a,但一块把空格删除了,实际上不需要 print(value) valuelist.append(value) for i, j in 
zip(namelist, valuelist): # 多遍历循环,zip()接受一系列可迭代对象作为参数,将对象中对应的元素打包成一个个tuple(元组),然后返回由这些tuples组成的list(列表)。 profile_info[i] = j # print(profile_info) # 爬取图片 # 找到所有img标签,返回一个url的标签列表 img_urllist = [] resp = requests.get(url=url, headers=headers) content = resp.content soup = BeautifulSoup(content, 'lxml') # img_list ='div .album-wrap') img_list ='a>div>img') # print(img_list) for img in img_list: try: # src = img.find('img').get('src') src = img.get('src') if re.match(r'https:(.*)image(.*)auto$', src): img_urllist.append(src) except: continue # print(img_urllist) return intro_after_filter, profile_info, img_urllist # 下载爬到的数据:基本信息、信息框、图片 def download(name, intro, profile_dict, img_list): project_path = os.getcwd() # print('project_path:' + project_path) # 保存百科基本信息 if not os.path.exists('introduction'): os.mkdir('introduction') introduction_file = project_path + '/introduction/' + name + '.txt' # print(introduction_file) if not os.path.exists(introduction_file): with open(introduction_file, 'x') as f: f.write(intro) else: with open(introduction_file, 'w') as f: f.write(intro) # print('introduction输出完毕') # 保存信息框数据到excel if not os.path.exists('profile'): os.mkdir('profile') profile_file = project_path + '/profile/' + 'profile.csv' field_list = ['中文名', '外文名', '别名', '性别', '学位', '职称', '国籍', '民族', '出生地', '籍贯', '出生日期', '逝世日期', '星座', '血型', '身高','体重', '毕业院校', '职业', '经纪公司', '代表作品', '主要成就', '生肖', '语种', '特长', '粉丝名'] if not os.path.exists(profile_file): workbook = xlwt.Workbook(encoding='utf-8') output_sheet = workbook.add_sheet('profile_sheet', cell_overwrite_ok=True) for i in range(len(field_list)): output_sheet.write(0, i, field_list[i]) rb = xlrd.open_workbook(profile_file) rows_num = rb.sheet_by_name('profile_sheet').nrows # print(rows_num) wb = copy(rb) output_sheet = wb.get_sheet(0) # print(profile) for i in range(len(field_list)): if profile_dict.get(field_list[i]): output_sheet.write(rows_num, i, profile_dict.get(field_list[i])) else: continue os.remove(profile_file) # 保存图片 
# 请求头部,伪造浏览器,防止爬虫被反 headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36' } download_limit = 10 # 单个人物下载的最大图片数 if not os.path.exists('img'): os.mkdir('img') name_path = project_path + '/img/' + name if not os.path.exists(name_path): os.mkdir(name_path) count = 1 for img_url in img_list: try: response = requests.get(img_url, headers=headers) # 得到访问的网址 content = response.content filename = name_path + '/' + name + '_%s.jpg' % count with open(filename, "wb") as f: # 如果图片质量太差,跳过 if len(content) < 1000: continue f.write(content) # 保存图片 response.close() count += 1 # 每个模特最多只下载download_limit张 if count > download_limit: break except HTTPError as e: # HTTP响应异常处理 print(e.reason) if __name__ == '__main__': trigger = True while (trigger): name = '潘建伟' # input('查询词语:') intro, profile_dict, img_list = claw(name) download(name, intro, profile_dict, img_list) # print("查询结果:%s" % result) trigger = False 2. 人物履历等数据按json格式输出 2.1 json简介

json 是一种通信(数据交换)格式,可读性强,但 json.dumps() 默认会在分隔符后添加冗余空格 --> 可通过 separators 参数(如 separators=(',', ':'))去掉这些空格,对数据进行压缩

2.2 json.dumps()方法参数 json.dumps()方法:



dist_city={ 1:{ "city_id":1, "city_name":"北京", "area":["城东区","城南区"] }, 2:{ "city_id":2, "city_name":"上海", "area":["浦东区","朝阳区"] } }

{ "$schema": "", "type": "object", "properties": { "email": { "type": "string" }, "firstName":{ "type": "string" }, "lastName": { "type": "string" } } }

- ensure_ascii:中文dict编码问题(中文默认被转义为\uXXXX),设置 ensure_ascii=False。
- skipkeys:dumps存储dict时,key必须是基本类型(str、int、float、bool、None,其中非str的key会被转换为字符串),否则TypeError;如果skipkeys=True --> 屏蔽(跳过)不合法的键值对。
- sort_keys:拒绝json.dumps()方法自动排序,sort_keys=False(默认值)。
- indent:json.dumps()方法输出自动换行缩进的数据格式,indent=4(值为缩进量)。

3. python实现Restful框架的Flask接口

Flask官方教程文档:欢迎来到 Flask 的世界 — Flask中文文档(2.1.x)

python Restful API的Flask开发教程视频:3、Python RESTful API 开发_哔哩哔哩_bilibili 

# 防止Flask实现的restful接口返回中文乱码
app.config['JSON_AS_ASCII'] = False

4. bs4读取table表格数据

可以借助 pandas.read_html() 方法(注意不是 read_table(),后者用于读取分隔符文本文件)=> 将HTML表格解析为 pandas DataFrame 数据格式。





